library(GEOquery)
library(crayon)
library(tidyverse)
library(Seurat)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(Matrix)
library(RCurl)
library(DoubletFinder)
library(scales)
library(SoupX)
library(cowplot)
library(metap)
library(SingleCellExperiment)
library(DropletUtils)
library(AnnotationHub)
library(HGNChelper)
library(ensembldb)
library(multtest)
library(glmGamPoi)
library(pbapply)
library(data.table)
library(ggrepel)
library(ggpubr)
library(stringr)
library(canvasXpress)
library(clinfun)
library(GGally)
library(factoextra)
library(DESeq2)
library(limma)
library(fgsea)
library(org.Mm.eg.db)
library(kableExtra)
# Specify path to supplementary files.
supp_file_path = "/Users/kendrix/Documents/Work/UnityHealth/Data_Projects/scRNAseq_LungFibrosisAnalysis/GSE250396/"
# Get unique sample names.
sample_names = list.files(path = paste0(supp_file_path, "raw_data"), pattern = "GSM", full.names = FALSE)
sample_names
## [1] "GSM7978625_d0" "GSM7978626_d14" "GSM7978627_d35"
# Create Seurat object for each sample.
for (file in sample_names){
seurat_data <- Read10X(data.dir = paste0(supp_file_path, "raw_data/", file))
seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`,
min.features = 100,
project = file)
assign(file, seurat_obj)
}
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
## 10X data contains more than one type and is being returned as a list containing matrices of each type.
# Remove unneeded objects from the environment.
rm(file, seurat_data, seurat_obj)
# Merge Seurat objects.
merged_seurat <- merge(x = GSM7978625_d0,
y = c(GSM7978626_d14,
GSM7978627_d35),
add.cell.id = c("Lung_Control_Day0",
"Lung_Bleomycin_Day14",
"Lung_Bleomycin_Day35"))
# Concatenate the count matrices of the samples together.
merged_seurat <- JoinLayers(merged_seurat)
# Remove individual Seurat objects after successful merge.
rm(list = ls(pattern = "GSM"))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 15846150 846.3 25447820 1359.1 NA 25447820 1359.1
## Vcells 349644557 2667.6 1487126702 11345.9 1024000 1503986775 11474.6
# Pull GEO metadata.
GSE250396_meta <- getGEO(GEO = "GSE250396", GSEMatrix = TRUE)
## Found 1 file(s)
## GSE250396_series_matrix.txt.gz
GSE250396_meta <- pData(GSE250396_meta$GSE250396_series_matrix.txt.gz)
# Subset metadata.
GSE250396_meta <- GSE250396_meta %>% subset(select = c(geo_accession,
organism_ch1,
`cell type:ch1`,
`genotype:ch1`,
`time:ch1`,
`tissue:ch1`,
`treatment:ch1`))
# Rename columns to more accessible names.
colnames(GSE250396_meta) <- c("orig.ident",
"organism",
"cell_type",
"genotype",
"timepoint",
"tissue",
"treatment")
# Edit the orig.ident column to match Seurat object.
GSE250396_meta$orig.ident <- gsub("GSM7978625", "GSM7978625_d0", GSE250396_meta$orig.ident)
GSE250396_meta$orig.ident <- gsub("GSM7978626", "GSM7978626_d14", GSE250396_meta$orig.ident)
GSE250396_meta$orig.ident <- gsub("GSM7978627", "GSM7978627_d35", GSE250396_meta$orig.ident)
# Add quality metrics to the metadata.
# Count total percentage of mitochondrial genes.
merged_seurat <- PercentageFeatureSet(merged_seurat,
pattern = "^mt-",
col.name = "percent.mito")
# Count total percentage of ribosomal genes.
merged_seurat <- PercentageFeatureSet(merged_seurat,
pattern = "^rp[sl]",
col.name = "percent.ribo")
# Count total haemoglobin genes (but not HBP).
merged_seurat <- PercentageFeatureSet(merged_seurat,
pattern = "^hb[^(p)]",
col.name = "percent.globin")
We can merged the metadata from GEO and put the merged metadata back into the Seurat object.
# Get metadata from Seurat object.
metadata = merged_seurat@meta.data %>% rownames_to_column()
dim(metadata)
## [1] 103750 7
# Merge metadata with GEO metadata by the newly created orig.ident column.
metadata <- merge(metadata, GSE250396_meta, by = "orig.ident")
dim(metadata)
## [1] 103750 13
# Edit and retain only required columns.
metadata <- metadata %>% column_to_rownames()
# Generate quality metrics.
names(metadata)[names(metadata) == "nCount_RNA"] <- "nUMI"
names(metadata)[names(metadata) == "nFeature_RNA"] <- "nGene"
# Compute mitochondrial ratio.
metadata$mitoRatio = PercentageFeatureSet(object = merged_seurat, pattern = "^mt-")
metadata$mitoRatio = metadata$mitoRatio / 100
# Calculate novelty score.
metadata$log10GenesPerUMI <- log10(metadata$nGene) / log10(metadata$nUMI)
# Add the new metadata back to the Seurat object.
merged_seurat@meta.data <- metadata
# Remove unneeded objects.
rm(GSE250396_meta, metadata, sample_names)
Check cell counts with number of unique barcodes detected to determine capture efficiency.
# Extract metadata from Seurat object.
metadata = merged_seurat@meta.data
# Visualize cell counts.
metadata %>%
ggplot(aes(x = timepoint, fill = timepoint)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("NCells")
The UMI counts per cell should generally be above 500, that is the low end of what we expect. If UMI counts are between 500-1000 counts, it is usable but the cells probably should have been sequenced more deeply.
# Visualize UMI counts per cell.
metadata %>%
ggplot(aes(x = nUMI, color = timepoint, fill = timepoint)) +
geom_density(alpha = 0.2) +
scale_x_log10(labels = label_number()) +
ylab("Cell density") +
geom_vline(xintercept = 500, linetype = "dashed", color = "red") +
theme_classic()
# Compare UMI counts and gene counts per cell.
VlnPlot(merged_seurat, features = c("nUMI", "nGene"), alpha = 0.2, raster = FALSE)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
Generally, we expect the novelty score to be above 0.80 for good quality cells.
# Visualize novelty score.
metadata %>%
ggplot(aes(x = log10GenesPerUMI, color = timepoint, fill = timepoint)) +
geom_density(alpha = 0.2) +
geom_vline(xintercept = 0.8, linetype = "dashed", color = "red") +
theme_classic()
In general, poor quality samples have mitochondrial and ribosomal ratio higher than 0.2 mark.
# Visualize mitochondrial ratio.
metadata %>%
ggplot(aes(x = mitoRatio, color = timepoint, fill = timepoint)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
theme_classic()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_density()`).
We can also use Seurat’s native function to visualize mito, ribo and haemoglobin genes in the dataset.
# Visualize all three metrics.
VlnPlot(merged_seurat, features = c("percent.mito", "percent.ribo", "percent.globin"), raster = FALSE)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.ribo.
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.globin.
Considering any of these QC metrics in isolation can lead to misinterpretation of cellular signals. Jointly visualizing the count and gene thresholds and additionally overlaying the mitochondrial fraction, gives a summarized persepective of the quality per cell.
# Visualize joint filtering QCs.
metadata %>%
ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method = lm) +
scale_x_log10() +
scale_y_log10() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
theme_classic() +
facet_wrap(~timepoint)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# Rank expression barcodes.
br.out = barcodeRanks(merged_seurat@assays$RNA$counts)
# Plot knee plot.
plot(br.out$rank, br.out$total, log = "xy", xlab = "Rank", ylab = "Total")
abline(h = metadata(br.out)$knee, col = "dodgerblue", lty = 2)
abline(h = metadata(br.out)$inflection, col = "forestgreen", lty = 2)
legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"), legend = c("knee", "inflection"))
# Remove unneeded object.
rm(br.out)
Now that we have visualized the various metrics, we can decide on the thresholds to apply which will result in the removal of low quality cells. We will use the following thresholds:
nUMI > 500
nGene > 250
log10GenesPerUMI > 0.8
mitoRatio < 0.2
# Get pre-filtering cell counts.
dim(merged_seurat)[2] # 103,750 cells
## [1] 103750
# Low-quality cell filtering.
umi_filtered <- subset(x = merged_seurat, subset = nUMI >= 500)
gene_filtered <- subset(x = umi_filtered, subset = nGene >= 250)
genesperumi_filtered <- subset(x = gene_filtered, subset = log10GenesPerUMI > 0.80)
mitoratio_filtered <- subset(x = genesperumi_filtered, subset = mitoRatio < 0.20)
# Get post-filtering Seurat object.
# Enter level of filtering desired here.
cell_filtered_seurat = mitoratio_filtered
# Get counts after low-quality cell filtering.
dim(cell_filtered_seurat)[2] # 101,444 cells.
## [1] 101444
We can visualize the amount of cells removed and the number of cells remaining for each filtering step in the plot below.
We also remove genes that have zero expression in all cells and genes that are only expressed in 10 or less cells as genes that are expressed in only a handful of cells are not meaningful and can bring down the averages for all the other cells that they are not expressed in.
# Extract count matrix.
counts <- GetAssayData(object = cell_filtered_seurat, layer = "counts")
# Get a logical vector for every gene that has more than zero counts per cell.
nonzero <- counts > 0
# Get a logical vector of genes that are expressed in 10 or more cells.
keep_genes <- Matrix::rowSums(nonzero) >= 10
# Subset counts to keep genes that are non-zero and expressed in 10 or more cells.
filtered_counts <- counts[keep_genes, ]
# Create new Seurat object with the subsetted counts.
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = cell_filtered_seurat@meta.data)
We can visualize the amount of genes removed and the number of genes remaining for each filtering step in the plot below.
Let’s have a look of our expression distribution before normalization.
# Visualize expression distribution before normalization.
hist(colSums(filtered_seurat[["RNA"]]$counts),
breaks = 100,
col = "#212226",
main = "Pre-Normalization: Expression distribution",
xlab = "Sum of expression")
Before we make any comparisons across cells, we will apply a simple normalization. This is solely for the purpose of exploring the sources of variation in our data.
# Apply simple normalization to merged Seurat objects.
seurat_phase <- NormalizeData(filtered_seurat)
## Normalizing layer: counts
# Visualize expression distribution after simple normalization.
hist(colSums(seurat_phase[["RNA"]]$data),
breaks = 100,
col = "#b8520a",
main = "Post-Simple Normalization: Expression distribution",
xlab = "Sum of expression")
# Remove unneeded objects.
rm(filtered_seurat)
# Download cell cycle genes for organism at https://github.com/hbc/tinyatlas/tree/master/cell_cycle.
cc_file <- getURL("https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Mus_musculus.csv")
cell_cycle_genes <- read.csv(text = cc_file)
# Remove unneeded objects.
rm(cc_file)
All of the cell cycle genes are Ensembl IDs, but our gene IDs are the gene names. To score the genes in our count matrix for cell cycle, we need to obtain the gene names for the cell cycle genes.
# Connect to AnnotationHub.
ah <- AnnotationHub()
# Access the Ensembl database for organism.
ahDb <- query(ah,
pattern = c("Mus musculus", "EnsDb"),
ignore.case = TRUE)
# Acquire the latest annotation files.
id <- ahDb %>%
mcols() %>%
rownames() %>%
tail(n = 1)
# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb,
return.type = "data.frame")
# Select annotations of interest.
annotations <- annotations %>%
dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)
# Remove unneeded objects.
rm(ah, ahDb, edb, id)
We can merge the Ensembl genes to the cell cycle genes.
# Get gene names for Ensembl IDs for each gene.
cell_cycle_markers <- dplyr::left_join(cell_cycle_genes, annotations, by = c("geneID" = "gene_id"))
# Acquire the S phase genes.
s_genes <- cell_cycle_markers %>%
dplyr::filter(phase == "S") %>%
pull("gene_name")
# Acquire the G2M phase genes.
g2m_genes <- cell_cycle_markers %>%
dplyr::filter(phase == "G2/M") %>%
pull("gene_name")
# Remove unneeded objects.
rm(cell_cycle_genes, cell_cycle_markers, annotations)
We can then use the annotated cell cycle genes to perform cell cycle scoring.
# Perform cell cycle scoring
seurat_phase <- CellCycleScoring(seurat_phase,
g2m.features = g2m_genes,
s.features = s_genes)
# Remove unneeded objects.
rm(g2m_genes, s_genes)
# Identify the most variable genes.
seurat_phase <- FindVariableFeatures(seurat_phase,
selection.method = "vst",
nfeatures = 2000,
verbose = FALSE)
# Scale the counts.
seurat_phase <- ScaleData(seurat_phase)
## Centering and scaling data matrix
# Visualize expression distribution after scaling.
hist(colSums(seurat_phase[["RNA"]]$scale.data),
breaks = 100,
col = "#779e7f",
main = "Post-Scaling: Expression distribution",
xlab = "Sum of expression")
# Identify the 15 most highly variable genes.
ranked_variable_genes <- VariableFeatures(seurat_phase)
top_genes <- ranked_variable_genes[1:15]
# Plot the average expression and variance of these genes with labels to indicate which genes are in the top 15.
p <- VariableFeaturePlot(seurat_phase)
LabelPoints(plot = p, points = top_genes, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
# Remove unneeded objects.
rm(ranked_variable_genes, top_genes, p)
# Perform PCA.
seurat_phase <- RunPCA(seurat_phase)
## PC_ 1
## Positive: Nfib, Dlc1, Ghr, Ptprg, Timp3, Ptprm, Prex2, Nckap5, Sparc, Epas1
## Ptprk, Rhoj, Ldb2, Ptprb, Hmcn1, Pde3a, Prickle2, Adgrf5, Cyyr1, Tmtc2
## Rbms3, Prkg1, Fermt2, Calcrl, 9530026P05Rik, Sema3c, Bmpr2, Scn7a, Meis1, Lhfp
## Negative: Spp1, Ctss, Lyz2, Ctsd, Ccl6, Fth1, Tyrobp, Fabp5, Fcer1g, Cybb
## Cyba, Lgals3, Mpeg1, Wfdc17, Psap, Clec4n, Trem2, Acp5, Cd68, Ftl1
## F7, Atp6v0d2, Itgax, Ctsz, Tbxas1, Pld3, Rbm47, Tbc1d9, Rnf149, Wfdc21
## PC_ 2
## Positive: Col1a2, Plxdc2, Gsn, Col3a1, Serping1, Lama2, Col1a1, Rbms3, Bicc1, Fbn1
## Adamts2, Bgn, Pdzrn3, C3, Fth1, Ogn, Lgals1, Dpt, Pid1, Cped1
## Dcn, Cfh, Mgp, Aebp1, Loxl1, Col6a1, Abca8a, Mfap5, Zfpm2, Fbln1
## Negative: Calcrl, Cyyr1, Adgrf5, Slco2a1, Cdh5, Tspan7, Galnt18, Adgrl4, Ptprb, Podxl
## Cldn5, Egfl7, Samd12, Epas1, Ldb2, Clic5, Afap1l1, Shank3, Ptprm, Eng
## Pcdh17, Cd93, Ccdc85a, Tek, Ramp2, Acer2, Bmp6, Sema6a, Ptprg, Nckap5
## PC_ 3
## Positive: Mertk, Cd9, Tcf7l2, Lrmda, Atp6v0d2, Lpl, Mrc1, Lyz2, Dmxl2, Mctp1
## Mitf, Olr1, Plin2, Lrp12, Clec4n, Ftl1, Mpeg1, Ccl6, Ctsd, Tbxas1
## Lgals3, Dapk1, F7, Nceh1, Abcg1, Kcnn3, Plet1, Tbc1d9, Slc7a2, Plcb1
## Negative: Inpp4b, Gramd3, Ptpn22, Skap1, Bank1, Ighm, Itk, Gm2682, Igkc, Kcnq5
## Grap2, Txk, Lef1, Ms4a4b, Sidt1, Themis, Cd247, Bcl11b, Camk4, Ly6d
## Ccr7, Cmah, Tox, Cd28, Trbc2, Pou2af1, Dusp10, Il18r1, Iglc2, Il2rb
## PC_ 4
## Positive: Skap1, Itk, Gm2682, Grap2, Cd247, Themis, Bcl11b, Camk4, Gramd3, Txk
## Ptpn22, Lef1, Cd28, Ms4a4b, Tox, Atp10a, Trbc2, Il7r, Kcnn3, Sidt1
## Atp6v0d2, C530008M17Rik, Icos, Dusp10, Acp5, Pld3, Il2rb, Trbc1, Mertk, Nol4l
## Negative: Il1b, S100a9, Il1r2, Csf3r, Hdc, S100a8, Slc7a11, Acod1, Clec4d, Retnlg
## Mmp9, Cxcr2, Entpd1, Trem3, Ptgs2, Nlrp3, Arg2, G0s2, Antxr2, Cxcl2
## Clec4e, Hp, Sgms2, Ifitm1, Adam8, Cass4, Rnf149, Stfa2l1, Lcn2, Il1f9
## PC_ 5
## Positive: Apoe, Spp1, Mmp2, C1qb, C1qa, Mmp14, C1qc, Mfap5, Fbln1, Dpt
## Cygb, Igf1, Serpinf1, Col14a1, Serping1, Abca8a, Wfdc17, Clec3b, Scara5, Nid1
## Lpar1, Cfh, C3ar1, Mafb, Ctsl, Pdgfra, Cd34, Lum, Emp1, Gas7
## Negative: Ank3, Thsd4, Wwc1, Wfdc2, Sftpb, Cbr2, Myh11, Ppp1r14c, Sftpa1, Krt8
## Sec14l3, Ptprf, Myh14, Sftpd, Sftpc, Acta2, Cldn18, Slc34a2, Cldn3, Nav2
## Tagln, Cxcl15, Prdm16, Ctnna3, Arhgef38, Kcnip4, Cdh1, Myo5b, Dmd, Lmod1
# Plot the PCA colored by cell cycle phase.
DimPlot(seurat_phase,
reduction = "pca",
group.by = "Phase",
split.by = "Phase")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
# Split effects of PCA by agedness
DimPlot(seurat_phase,
reduction = "pca",
group.by = "Phase",
split.by = "timepoint")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
We don’t see large differences due to the effects of cell cycle, so cell cycle effects are not regressed out.
# Free up memory.
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 16352587 873.4 25447820 1359.1 NA 25447820 1359.1
## Vcells 877935136 6698.2 1570959530 11985.5 1024000 1954914125 14914.9
# Set memory limit.
options(future.globals.maxSize = 5000 * 1024^2)
# Perform SCTransform normalization.
seurat_sct <- SCTransform(seurat_phase,
vars.to.regress = c("mitoRatio"),
vst.flavor = "v2",
conserve.memory = TRUE,
verbose = FALSE)
## Will not return corrected UMI because residual type is not set to 'pearson'
# Compare the objects before and after SCT transformation.
seurat_phase # Look at the feature counts before transformation.
## An object of class Seurat
## 23340 features across 101444 samples within 1 assay
## Active assay: RNA (23340 features, 2000 variable features)
## 3 layers present: counts, data, scale.data
## 1 dimensional reduction calculated: pca
seurat_sct # Look at the feature counts after transformation.
## An object of class Seurat
## 46680 features across 101444 samples within 2 assays
## Active assay: SCT (23340 features, 3000 variable features)
## 3 layers present: counts, data, scale.data
## 1 other assay present: RNA
## 1 dimensional reduction calculated: pca
# Check which objects are stored in which assays.
seurat_sct@assays
## $RNA
## Assay (v5) data with 23340 features for 101444 cells
## Top 10 variable features:
## Jchain, Igkc, Ighm, Sftpc, Retnla, Scgb1a1, Igha, Gm26870, Sftpa1, Saa3
## Layers:
## counts, data, scale.data
##
## $SCT
## SCTAssay data with 23340 features for 101444 cells, and 1 SCTModel(s)
## Top 10 variable features:
## Hbb-bs, Hba-a1, Hba-a2, Gm26870, Hbb-bt, S100a9, Igkc, Scgb1a1, S100a8,
## Sftpc
# Remove unneeded objects.
rm(seurat_phase)
The SCTransform normalized counts are stored in the SCT$counts slot. Here is the distribution of the SCTransform normalized counts distribution.
# Visualize expression distribution after scaling.
hist(colSums(seurat_sct[["SCT"]]$counts),
breaks = 100,
col = "#332f42",
main = "Post-SCTransform: Expression distribution",
xlab = "Sum of expression")
The log-normalized SCTransfromed counts are also stored in the SCT$data slot. Here is the distribution of the log-normalized SCTransformed counts.
hist(colSums(seurat_sct[["SCT"]]$data),
breaks = 100,
col = "#b14b34",
main = "Post-SCTransformed: Log-normalized expression distribution",
xlab = "Sum of expression")
The goal of this step is to perform unsupervised clustering to identify groups of cells based on similarities of their transcriptomes without any prior knowledge of the labels or the number of clusters. This can be challenging due to the high level of noise and large number of dimensions, so it is often beneficial to apply a dimensionality reduction method to reduce the amount of noise.
# Run PCA.
seurat_sct <- RunPCA(seurat_sct, verbose = FALSE)
# Visualize dimension reduction from PCA.
seurat_sct <- RunUMAP(seurat_sct, dims = 1:40, reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 22:55:36 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:55:36 Read 101444 rows and found 40 numeric columns
## 22:55:36 Using Annoy for neighbor search, n_neighbors = 30
## 22:55:36 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:55:52 Writing NN index file to temp file /var/folders/1c/sbnd3yvd7mzg5lng1jcr70tw0000gn/T//RtmptpeH1F/file9cd469cdfa48
## 22:55:52 Searching Annoy index using 1 thread, search_k = 3000
## 22:56:33 Annoy recall = 100%
## 22:56:35 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:56:38 Initializing from normalized Laplacian + noise (using RSpectra)
## 22:56:41 Commencing optimization for 200 epochs, with 4712486 positive edges
## 22:57:27 Optimization finished
DimPlot(object = seurat_sct, group.by = "timepoint")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
# Explore the top PC markers.
DimHeatmap(seurat_sct,
dims = 1:9,
cells = 500,
balanced = TRUE)
We can visualize an elbow plot to see how many PCs to use for clustering.
# Build the elbow plot.
# Determine percent of variation associated with each PC.
pct <- seurat_sct[["pca"]]@stdev / sum(seurat_sct[["pca"]]@stdev) * 100
# Calculate cumulative percents for each PC.
cumu <- cumsum(pct)
# Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5.
co1 <- which(cumu > 90 & pct < 5)[1]
co1
## [1] 41
# Determine the difference between variation of PC and subsequent PC.
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
# Last point where change of % of variation is more than 0.1%.
co2
## [1] 20
# Minimum of the two calculation.
pcs <- min(co1, co2)
pcs
## [1] 20
# Create a dataframe with values.
elbowplot_df <- data.frame(pct = pct,
cumu = cumu,
rank = 1:length(pct))
# Elbow plot to visualize.
ggplot(elbowplot_df, aes(cumu, pct, label = rank, color = rank > pcs)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
theme_bw()
# Remove unneeded objects.
rm(co1, co2, cumu, elbowplot_df, pcs, pct)
Based on the elbow plot, we will use approx. 20 PCs to generate the clusters.
The basic steps towards complete clustering involve:
Measure of similarity: How does one quantify how close two data points are?
Quality function: How does one define the clustering/partition of the points?
Algorithm: How to find the clustering whereby the quality function is optimized?
We use a graph-based community detection method where each vertex represents a cell and the weight of the edge measures similarities between two cells. Put simply, we start by building an unweighted K-nearest neighbour (k-NN) graph and add weights to obtain a shared nearest neighbour (SNN) graph. Weight can be added either by finding out the shared nodes (overlap) between two neighbours (more nodes; more similar the neighbours; more weight) or measuring the closeness of both neighbours to each other.
# Compute SNN graph.
seurat_cluster <- FindNeighbors(object = seurat_sct, dims = 1:20) # Based on elbow plot.
## Computing nearest neighbor graph
## Computing SNN
# Remove unneeded object.
rm(seurat_sct)
Next, we want to iteratively group the neighbours together with the goal of combining the cells with similar transcriptomes together. Without knowing the number of clusters a priori, we use a range of resolution based on the approximation of the recommendation where datasets of 3,000 - 5,000 cells should set the resolution between 0.4 - 1.4. We have approximately 100,000 cells, so our range will start from 0.2 - 1.0.
# Determine start and end resolution range.
start_reso_range = 0.2
end_reso_range = 1
range_interval = 0.2
# Find clusters given the range.
seurat_cluster <- FindClusters(object = seurat_cluster,
resolution = seq(from = start_reso_range,
to = end_reso_range,
by = range_interval))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 101444
## Number of edges: 3208830
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9540
## Number of communities: 18
## Elapsed time: 48 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 101444
## Number of edges: 3208830
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9376
## Number of communities: 25
## Elapsed time: 46 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 101444
## Number of edges: 3208830
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9261
## Number of communities: 33
## Elapsed time: 47 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 101444
## Number of edges: 3208830
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9161
## Number of communities: 37
## Elapsed time: 43 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 101444
## Number of edges: 3208830
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9086
## Number of communities: 42
## Elapsed time: 47 seconds
# Remove unneeded objects.
rm(start_reso_range, end_reso_range, range_interval)
We can look at the maximum number of clusters found at each resolution with the output below.
# Create a dummy table to contain the max cluster for each resolution.
maxclust = c()
clustoutput = seurat_cluster@meta.data %>% subset(select = grep("SCT_snn_", colnames(.)))
clustoutput[] <- lapply(clustoutput, function(x) as.numeric(as.character(x)))
# Iterate and print the maximum cluster count for each resolution.
for (i in 1:ncol(clustoutput)){
maxclust[i] = max(clustoutput[, i])
print(paste0("Max n-cluster for ",
names(clustoutput[i]), ": ",
maxclust[i]))
}
## [1] "Max n-cluster for SCT_snn_res.0.2: 17"
## [1] "Max n-cluster for SCT_snn_res.0.4: 24"
## [1] "Max n-cluster for SCT_snn_res.0.6: 32"
## [1] "Max n-cluster for SCT_snn_res.0.8: 36"
## [1] "Max n-cluster for SCT_snn_res.1: 41"
# Remove unneeded objects.
rm(clustoutput, i, maxclust)
We can visualize the difference clusterings of the resolutions side by side. In this case, we will look at these resolutions: 0.2, 0.4, 0.6.
# Select resolutions to visualize.
resolutions = c("0.2", "0.4", "0.6")
feature_cols = paste0("SCT_snn_res.", resolutions)
# Visualize UMAP of different resolutions.
for (reso in feature_cols) {
Idents(seurat_cluster) <- reso
p <- DimPlot(seurat_cluster,
reduction = "umap",
label = TRUE,
label.size = 3,
raster = FALSE) +
ggtitle(reso) +
NoLegend()
print(p)
}
# Remove unneeded objects.
rm(feature_cols, p, reso, resolutions)
Let’s start with the exploration with the lowest resolution in the range, 0.2.
# Sort the cluster labels in ascending order.
reso_columns = grep("SCT_snn_", colnames(seurat_cluster@meta.data), value = TRUE)
metadata <- seurat_cluster@meta.data
for (col in colnames(metadata)) {
if (col %in% reso_columns){
metadata[, col] <- factor(metadata[, col], levels = paste(sort(as.integer(levels(metadata[, col])))))
}
}
seurat_cluster@meta.data <- metadata
# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_cluster) <- "SCT_snn_res.0.2"
# Visualize cluster with a UMAP.
DimPlot(seurat_cluster,
reduction = "umap",
label = TRUE,
label.size = 3) +
ggtitle("SCT_snn_res.0.2") +
NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
# Remove unneeded objects.
rm(col, metadata, reso_columns)
# Extract number of cells per cluster.
n_cells <- FetchData(seurat_cluster,
vars = c("ident", "treatment")) %>%
dplyr::count(ident, treatment)
# Visualize distribution of cells by cluster in a barplot.
ggplot(n_cells, aes(x = ident, y = n, fill = treatment)) +
geom_bar(position = position_dodge(),
stat = "identity") +
geom_text(aes(label = n),
size = 2,
vjust = -0.2,
position = position_dodge(1)) +
theme_classic() +
theme(axis.text.x = element_text(size = 5))
# Visualize distribution of treatment in each cluster.
DimPlot(seurat_cluster,
label = TRUE,
split.by = "treatment",
label.size = 3,
raster = FALSE) +
theme(axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 6)) +
NoLegend()
# Remove unneeded object.
rm(n_cells)
We would like to see if there is any enrichment of mitochondria in our clusters and if we need to regress out the mitochondrial level in our data.
# Plot mitoRatio to see if any cluster is enriched in mitochondrial level.
FeaturePlot(seurat_cluster,
reduction = "umap",
features = "mitoRatio",
cols = c("#d6cfc7", "#01013f"),
pt.size = 0.5,
order = TRUE,
min.cutoff = "q10",
label = TRUE,
label.size = 3,
label.color = "#960000")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
If our cell clusters show large differences in cell cycle expression, we might want to regress out the effects of cell cycle phase.
# Explore whether clusters segregate by cell cycle phase.
DimPlot(seurat_cluster,
label = TRUE,
split.by = "Phase",
label.size = 4) +
theme(axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 6)) +
NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
We don’t see any noticeable cell cycle phase effects in our clusters.
We can explore if any specific PCs drive the separation of the clusters.
# Define columns in the metadata to visualize.
columns = c(paste0("PC_", 1:12), "ident", "umap_1", "umap_2")
# Extract data from Seurat object based on column names.
pc_data <- FetchData(seurat_cluster, vars = columns)
# Add cluster label to the centre of cluster on UMAP.
umap_label = FetchData(seurat_cluster,
vars = c("ident", "umap_1", "umap_2")) %>%
group_by(ident) %>%
summarise(x = mean(umap_1), y = mean(umap_2))
# Plot a UMAP plot for each of the PCs.
map(paste0("PC_", 1:12), function(pc){
ggplot(pc_data, aes(umap_1, umap_2)) +
geom_point(aes_string(color = pc), alpha = 0.7) +
scale_color_gradient(guide = "none", low = "#d6cfc7", high = "#01013f") +
geom_text(data = umap_label, aes(label = ident, x, y), color = "#960000", size = 4) +
theme_classic() +
ggtitle(pc)}) %>%
plot_grid(plotlist = .)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Remove unneeded objects.
rm(columns, pc_data, umap_label)
We can print the genes driving different PCs and correlate those metagenes with the identity of the clusters.
# Print PC metagenes.
print(seurat_cluster[["pca"]], dims = 1:12, nfeatures = 5)
## PC_ 1
## Positive: Lyz2, Spp1, Fth1, Ftl1, Ctsd
## Negative: Mgp, Gsn, Dcn, Calcrl, Ptprm
## PC_ 2
## Positive: Lyz2, Mgp, Spp1, Fth1, Gsn
## Negative: Bank1, Ebf1, Arhgap15, Bach2, Satb1
## PC_ 3
## Positive: Lyz2, Spp1, Ctsd, Apoe, Ftl1
## Negative: S100a9, S100a8, Il1r2, Il1b, Retnlg
## PC_ 4
## Positive: Hbb-bs, Hba-a1, Hba-a2, Hbb-bt, Mgp
## Negative: Calcrl, Ptprm, Cyyr1, Prickle2, Sema3c
## PC_ 5
## Positive: Hbb-bs, Hba-a1, Hba-a2, Calcrl, Hbb-bt
## Negative: Mgp, Gsn, Dcn, Col3a1, Col1a2
## PC_ 6
## Positive: Acta2, Tagln, Myh11, Dmd, Tpm2
## Negative: Dcn, Gsn, Col3a1, Clec3b, Col1a2
## PC_ 7
## Positive: Ccl5, Skap1, Gm2682, Itk, Gzma
## Negative: Cd74, Ebf1, Bank1, Igkc, Ighm
## PC_ 8
## Positive: Apoe, Spp1, C1qa, C1qb, C1qc
## Negative: Kcnip4, Chil3, Sftpc, Scgb1a1, Nav2
## PC_ 9
## Positive: Ebf1, Chil3, Bank1, Kcnip4, Spp1
## Negative: Sftpc, Sftpa1, Ank3, Sftpb, Slc34a2
## PC_ 10
## Positive: Cd74, Cst3, H2-Eb1, H2-Aa, H2-Ab1
## Negative: Spp1, Lyz2, Sftpc, Ctsd, Fth1
## PC_ 11
## Positive: Dcn, Igfbp5, Gpc6, Opcml, Ebf2
## Negative: Mgp, Limch1, Inmt, Npnt, Slit2
## PC_ 12
## Positive: Lef1, Grap2, Itk, Themis, Emb
## Negative: Ccl5, Gzma, Igkc, Jchain, Il12rb2
The SCTransformed data was performed only on the 3000 most variable genes. Ideally, we want to explore the identity of the clusters with all the genes in the dataset and not just the top 3000 most variable genes, so we need to set the default assay back to the RNA assay.
# Set default assay back to RNA.
DefaultAssay(seurat_cluster) <- "RNA"
We can explore canonical cell type markers of our cell types of interest and visualize their expression on our UMAP. Canonical markers are obtained from LungMAP and A molecular cell atlas of the human lung from single-cell RNA sequencing (2020) Nature
AT1 cells are defined by the cell markers: Hopx, Ager, Rtkn2, Pdpn and Clic5. We can visualize both the UMAP and Dotplot to see which cluster(s) do these markers localize.
# Visualize AT1 markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Hopx", "Ager", "Rtkn2", "Pdpn", "Clic5"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("AT1 markers") +
theme(plot.title = element_text(hjust = 0.5))
AT2 cells are defined by the cell markers: Sftpb, Sftpc, Sftpd, Muc1, Etv5 and Lamp3.
# Visualize AT2 markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Sftpb", "Sftpc", "Sftpd", "Muc1", "Etv5", "Lamp3"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("AT2 markers") +
theme(plot.title = element_text(hjust = 0.5))
Alveolar fibroblast cells are defined by the cell markers: Pdgfra, Tcf21, Wnt2, Mfap5, Col1a1.
Pericytes are defined by the cell markers: Pdgfrb, Trpc6, Cspg4.
# Visualize alveolar fibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Tcf21", "Wnt2", "Mfap5", "Pdgfra", "Pdgfrb", "Trpc6", "Col1a1"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("Fibroblast markers") +
theme(plot.title = element_text(hjust = 0.5))
Myofibroblast cells are defined by the cell markers: Wt1, Fgf18, Dach2, Frem2, Eln, Acta2.
# Visualize myofibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Wt1", "Fgf18", "Dach2", "Frem2", "Acta2"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("Myofibroblast markers") +
theme(plot.title = element_text(hjust = 0.5))
Azimuth uses a reference-based mapping with the Human Biomolecular Atlas Project, containing references for multiple human tissues.
# Check all available Azimuth references.
available_data <- AvailableData()
available_data[grep("Azimuth", available_data[, 3]), 1:3] # Use lungref.
## Dataset Version Summary
## adiposeref.SeuratData adiposeref 1.0.0 Azimuth Reference: adipose
## bonemarrowref.SeuratData bonemarrowref 1.0.0 Azimuth Reference: bonemarrow
## fetusref.SeuratData fetusref 1.0.0 Azimuth Reference: fetus
## heartref.SeuratData heartref 1.0.0 Azimuth Reference: heart
## humancortexref.SeuratData humancortexref 1.0.0 Azimuth Reference: humancortex
## kidneyref.SeuratData kidneyref 1.0.2 Azimuth Reference: kidney
## lungref.SeuratData lungref 2.0.0 Azimuth Reference: lung
## mousecortexref.SeuratData mousecortexref 1.0.0 Azimuth Reference: mousecortex
## pancreasref.SeuratData pancreasref 1.0.0 Azimuth Reference: pancreas
## pbmcref.SeuratData pbmcref 1.0.0 Azimuth Reference: pbmc
## tonsilref.SeuratData tonsilref 2.0.0 Azimuth Reference: tonsil
# Run Azimuth with lungref.
options(timeout = 1000)
seurat_annotated <- RunAzimuth(seurat_cluster, reference = "lungref", assay = "RNA")
## Warning: Overwriting miscellanous data for model
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## detected inputs from MOUSE with id type Gene.name
## reference rownames detected HUMAN with id type Gene.name
## Found 16201 out of 23340 total inputs in conversion table
## Normalizing query using reference SCT model
## Warning: No layers found matching search pattern provided
## Warning: 849 features of the features specified were not present in both the reference query assays.
## Continuing with remaining 2151 features.
## Projecting cell embeddings
## Finding query neighbors
## Finding neighborhoods
## Finding anchors
## Found 15178 anchors
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
##
## Integrating dataset 2 with reference dataset
## Finding integration vectors
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_dr_ to integrateddr_
## Computing nearest neighbors
## Running UMAP projection
## Warning in RunUMAP.default(object = neighborlist, reduction.model =
## reduction.model, : Number of neighbors between query and reference is not equal
## to the number of neighbors within reference
## 23:11:02 Read 101444 rows
## 23:11:02 Processing block 1 of 1
## 23:11:02 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 23:11:02 Initializing by weighted average of neighbor coordinates using 1 thread
## 23:11:02 Commencing optimization for 67 epochs, with 2028880 positive edges
## 23:11:13 Finished
## Warning: No assay specified, setting assay as RNA by default.
## Projecting reference PCA onto query
## Finding integration vector weights
## Projecting back the query cells into original PCA space
## Finding integration vector weights
## Computing scores:
## Finding neighbors of original query cells
## Finding neighbors of transformed query cells
## Computing query SNN
## Determining bandwidth and computing transition probabilities
## Total elapsed time: 1.17997138500214
# Remove unneeded object.
rm(available_data)
# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_annotated) <- "predicted.ann_finest_level"
# Visualize annotated cluster with a UMAP.
DimPlot(seurat_annotated,
reduction = "umap",
group.by = "predicted.ann_finest_level",
label = TRUE,
label.size = 4,
repel = TRUE) +
ggtitle("Azimuth annotation") +
theme(plot.title = element_text(hjust = 0.5)) +
NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
# Visualize annotation prediction score.
FeaturePlot(seurat_annotated,
reduction = "umap",
features = "mapping.score",
cols = c("#01013f", "#59974d", "#fff000")) +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8)) +
ggtitle("Annotation prediction score")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
# Get metadata.
annotation_meta = seurat_annotated@meta.data
# Subset the required columns.
annotation_meta <- annotation_meta %>% subset(select = c(SCT_snn_res.0.2, predicted.ann_finest_level, mapping.score))
# Group the table by cluster and annotation.
annotation_meta <- annotation_meta %>%
dplyr::rename(cluster = "SCT_snn_res.0.2",
cluster_annotation = "predicted.ann_finest_level") %>%
group_by(cluster, cluster_annotation) %>%
summarise(mean_mapping_score = mean(mapping.score))
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
# Visualize the probable annotations for each cluster.
annotation_meta <- annotation_meta %>% arrange(cluster, desc(mean_mapping_score)) %>% slice_head(n = 3)
annotation_meta
## # A tibble: 54 × 3
## # Groups: cluster [18]
## cluster cluster_annotation mean_mapping_score
## <fct> <chr> <dbl>
## 1 0 Migratory DCs 0.852
## 2 0 Mast cells 0.841
## 3 0 T cells proliferating 0.750
## 4 1 Migratory DCs 0.924
## 5 1 Lymphatic EC mature 0.879
## 6 1 Classical monocytes 0.847
## 7 2 Plasmacytoid DCs 0.865
## 8 2 AT1 0.795
## 9 2 DC2 0.728
## 10 3 T cells proliferating 0.953
## # ℹ 44 more rows
# Remove unneeded object.
rm(annotation_meta)
We can cross-check the reference-based annotation with the ScType annotation for validation.
# Load in ScType functions.
source("/Users/kendrix/Documents/Work/UnityHealth/Data_Projects/auto_detect_tissue_type_KK.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
# Set path to ScType reference database.
db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx"
# Check tissue type for validation.
tissue_guess <- auto_detect_tissue_type(path_to_db_file = db_,
seuratObject = seurat_cluster,
scaled = TRUE,
assay = "SCT")
## [1] "Checking...Immune system"
## Selecting by scores
## [1] "Checking...Pancreas"
## Selecting by scores
## [1] "Checking...Liver"
## Selecting by scores
## [1] "Checking...Eye"
## Selecting by scores
## [1] "Checking...Kidney"
## Selecting by scores
## [1] "Checking...Brain"
## Selecting by scores
## [1] "Checking...Lung"
## Selecting by scores
## [1] "Checking...Adrenal"
## Selecting by scores
## [1] "Checking...Heart"
## Selecting by scores
## [1] "Checking...Intestine"
## Selecting by scores
## [1] "Checking...Muscle"
## Selecting by scores
## [1] "Checking...Placenta"
## Selecting by scores
## [1] "Checking...Spleen"
## Selecting by scores
## [1] "Checking...Stomach"
## Selecting by scores
## [1] "Checking...Thymus"
## Selecting by scores
## [1] "Checking...Hippocampus"
## Selecting by scores
# Define Seurat object.
seuobj = seurat_cluster
# Specify tissue of interest.
tissue = "Lung"
# Prepare gene sets.
gs_list <- gene_sets_prepare(db_, tissue)
# Check Seurat object version - count matrix is extracted differently between versions.
seurat_package_v5 <- isFALSE("counts" %in% names(attributes(seuobj[["RNA"]])))
print(sprintf("Seurat object %s is used", ifelse(seurat_package_v5, "v5", "v4")))
## [1] "Seurat object v5 is used"
# Extract scaled scRNAseq matrix.
scRNAseqData_scaled <- if (seurat_package_v5) as.matrix(seuobj[["RNA"]]$scale.data) else
as.matrix(seuobj[["RNA"]]@scale.data)
# Run ScType.
es.max <- sctype_score(scRNAseqData = scRNAseqData_scaled,
scaled = TRUE,
gs = gs_list$gs_positive,
gs2 = gs_list$gs_negative)
# Merge ScType results by clusters.
cL_results <- do.call("rbind",
lapply(unique(seuobj@meta.data$seurat_clusters),
function(cl){es.max.cl = sort(
rowSums(
es.max[, rownames(
seuobj@meta.data[
seuobj@meta.data$seurat_clusters == cl, ])]),
decreasing = !0)
head(data.frame(cluster = cl,
type = names(es.max.cl),
scores = es.max.cl,
ncells = sum(seuobj@meta.data$seurat_clusters == cl)), 10)
}))
sctype_scores <- cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# Set low-confident (low ScType score) clusters to "Unknown".
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown"
print(sctype_scores[, 1:3])
## # A tibble: 42 × 3
## # Groups: cluster [42]
## cluster type scores
## <fct> <chr> <dbl>
## 1 11 Immune system cells 1340.
## 2 8 Fibroblasts 16911.
## 3 18 Pulmonary alveolar type II cells 15768.
## 4 10 Fibroblasts 1470.
## 5 6 Unknown -78.9
## 6 3 Endothelial cell 9664.
## 7 21 Immune system cells 4369.
## 8 15 Alveolar macrophages 2727.
## 9 0 Clara cells 6486.
## 10 9 Unknown 230.
## # ℹ 32 more rows
# Add annotated cell types to the metadata.
seurat_cluster@meta.data$sctype_annotation = ""
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster==j, ]
seurat_cluster@meta.data$sctype_annotation[seurat_cluster@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
# Plot UMAP with ScType annotation.
DimPlot(seurat_cluster,
reduction = "umap",
label = TRUE,
label.size = 3,
repel = TRUE,
group.by = "sctype_annotation") +
NoLegend()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
# Remove unneeded objects.
rm(auto_detect_tissue_type, cL_results, cl_type, db_, es.max,
gene_sets_prepare, gs_list, j, scRNAseqData_scaled, sctype_score,
sctype_scores, seuobj, seurat_package_v5, tissue, tissue_guess)
# Add metadata from seurat_annotated, which contains the Azimuth annotation and ScType annotation to seurat_cluster.
seurat_cluster@meta.data <- seurat_annotated@meta.data
# Get Ensembl annotation data.
# Connect to AnnotationHub.
ah <- AnnotationHub()
# Access the Ensembl database for organism.
ahDb <- query(ah,
pattern = c("Mus musculus", "EnsDb"),
ignore.case = TRUE)
# Acquire the latest annotation files.
id <- ahDb %>%
mcols() %>%
rownames() %>%
tail(n = 1)
# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb, return.type = "data.frame")
# Select annotations of interest.
annotations <- annotations %>%
dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)
# Remove unneeded objects.
rm(ah, ahDb, edb, id)
# Check unique cell type labels.
unique(seurat_cluster$predicted.ann_finest_level)
## [1] "Alveolar fibroblasts" "Adventitial fibroblasts"
## [3] "EC arterial" "B cells"
## [5] "EC venous systemic" "DC2"
## [7] "Classical monocytes" "Monocyte-derived Mφ"
## [9] "CD4 T cells" "AT1"
## [11] "EC general capillary" "Smooth muscle"
## [13] "Pericytes" "Non-classical monocytes"
## [15] "Transitional Club-AT2" "EC aerocyte capillary"
## [17] "Plasma cells" "AT2"
## [19] "Peribronchial fibroblasts" "CD8 T cells"
## [21] "Club (non-nasal)" "Multiciliated (non-nasal)"
## [23] "Lymphatic EC differentiating" "Migratory DCs"
## [25] "Ionocyte" "Plasmacytoid DCs"
## [27] "Interstitial Mφ perivascular" "T cells proliferating"
## [29] "EC venous pulmonary" "Myofibroblasts"
## [31] "Mast cells" "Goblet (nasal)"
## [33] "Lymphatic EC mature" "Alveolar macrophages"
## [35] "Alveolar Mφ proliferating" "NK cells"
## [37] "Neuroendocrine" "SM activated stress response"
## [39] "Suprabasal" "Club (nasal)"
## [41] "Basal resting"
# Remove any cells that are annotated as nasal.
grep("nasal", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Club (non-nasal)" "Multiciliated (non-nasal)"
## [3] "Goblet (nasal)" "Club (nasal)"
# Club (nasal), Goblet (nasal)
dim(seurat_cluster)[2] # 101,444
## [1] 101444
seurat_cluster <- subset(seurat_cluster,
subset = predicted.ann_finest_level %in% c("Club (nasal)",
"Goblet (nasal)"),
invert = TRUE)
dim(seurat_cluster)[2] # 101,423 (21 cells removed)
## [1] 101423
# Remove any cells that are labelled mesothelial cells as they are pleural and are not specialized lung cells.
grep("Mesothelium", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique()
## character(0)
# None
# Pull cell metadata from the Seurat object.
metadata <- seurat_cluster@meta.data
# Aggregate some annotated cell types.
# Find all fibroblasts and aggregate them together.
grep("fibroblast", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Alveolar fibroblasts" "Adventitial fibroblasts"
## [3] "Peribronchial fibroblasts" "Myofibroblasts"
# Alveolar fibroblasts, Adventitial fibroblasts, Peribronchial fibroblasts
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"Alveolar fibroblasts" = "Fibroblasts",
"Adventitial fibroblasts" = "Fibroblasts",
"Peribronchial fibroblasts" = "Fibroblasts"))
# Find all endothelial cells and aggregate them together.
grep("EC", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "EC arterial" "EC venous systemic"
## [3] "EC general capillary" "EC aerocyte capillary"
## [5] "Lymphatic EC differentiating" "EC venous pulmonary"
## [7] "Lymphatic EC mature"
# EC arterial, EC venous systemic, EC general capillary, EC aerocyte capillary, Lympathic EC differentiating, EC venous pulmonary, Lympathic EC mature.
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"EC general capillary" = "Endothelial capillary",
"EC aerocyte capillary" = "Endothelial capillary",
"EC arterial" = "Endothelial arterial",
"EC venous systemic" = "Endothelial venous",
"EC venous pulmonary" = "Endothelial venous"))
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"Lymphatic EC mature" = "Endothelial lymphatic",
"Lymphatic EC differentiating" = "Endothelial lymphatic"))
# Find all smooth muscle cells and aggregate them together.
grep("Smooth|SM", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Smooth muscle" "SM activated stress response"
# Smooth muscle, SM activated stress response.
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"Smooth muscle" = "VSMCs",
"SM activated stress response" = "VSMCs"))
# Find all immune cells and aggregate them together.
metadata$predicted.ann_finest_level[metadata$predicted.ann_level_1 == "Immune"] %>% unique()
## [1] "B cells" "DC2"
## [3] "Classical monocytes" "Monocyte-derived Mφ"
## [5] "CD4 T cells" "Fibroblasts"
## [7] "Non-classical monocytes" "Endothelial venous"
## [9] "Plasma cells" "CD8 T cells"
## [11] "Pericytes" "Migratory DCs"
## [13] "Plasmacytoid DCs" "Interstitial Mφ perivascular"
## [15] "Club (non-nasal)" "Ionocyte"
## [17] "Endothelial capillary" "T cells proliferating"
## [19] "Mast cells" "VSMCs"
## [21] "Endothelial arterial" "Multiciliated (non-nasal)"
## [23] "AT1" "AT2"
## [25] "Alveolar macrophages" "Alveolar Mφ proliferating"
## [27] "NK cells" "Endothelial lymphatic"
## [29] "Basal resting"
# B cells, DC2, Classical monocytes, Monocyte-derived Mφ, CD4 T cells, Non-classical monocytes, Plasma cells, CD8 T cells, Migratory DCs, Plasmacytoid DCs, Interstitial Mφ perivascular, T cells proliferating, Mast cells, Alveolar macrophages, Alveolar Mφ proliferating, NK cells.
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"B cells" = "Immune",
"DC2" = "Immune",
"Classical monocytes" = "Immune",
"Monocyte-derived Mφ" = "Immune",
"CD4 T cells" = "Immune",
"Non-classical monocytes" = "Immune",
"Plasma cells" = "Immune",
"CD8 T cells" = "Immune",
"Migratory DCs" = "Immune",
"Plasmacytoid DCs" = "Immune",
"Interstitial Mφ perivascular" = "Immune",
"T cells proliferating" = "Immune",
"Mast cells" = "Immune",
"Alveolar macrophages" = "Immune",
"Alveolar Mφ proliferating" = "Immune",
"NK cells" = "Immune"))
# Get metadata column and rename.
metadata = (metadata %>%
subset(select = -cell_type) %>%
rename("predicted.ann_finest_level" = "cell_type"))
# Add edited metadata back to Seurat object.
seurat_cluster@meta.data <- metadata
# Set default idents of Seurat object.
Idents(seurat_cluster) <- "cell_type"
# Set colour scheme based on annotated cell type.
celltype_col = c("AT1" = "#2f4f4f",
"AT2" = "#00bfff",
"Basal resting" = "#2e8b57",
"Club (non-nasal)" = "#8b008b",
"Endothelial arterial" = "#ffa500",
"Endothelial capillary" = "#808000",
"Endothelial lympathic" = "#00008b",
"Endothelial venous" = "#00ff00",
"Fibroblasts" = "#ff4500",
"Immune" = "#f08080",
"Ionocyte" = "#f0e68c",
"Multiciliated (non-nasal)" = "#ff00ff",
"Myofibroblasts" = "#c71585",
"Neuroendocrine" = "#00fa9a",
"Pericytes" = "#ff1493",
"Suprabasal" = "#eee8aa",
"Transitional Club-AT2" = "#9370db",
"VSMCs" = "#00ffff")
# Visualize annotated cluster with a UMAP.
DimPlot(seurat_cluster,
reduction = "umap",
group.by = "cell_type",
cols = celltype_col,
label = TRUE,
label.size = 4,
repel = TRUE,
raster = FALSE) +
ggtitle("Final cell type annotation") +
theme(plot.title = element_text(hjust = 0.5))
save(seurat_cluster,
file = "/Users/kendrix/Documents/Work/UnityHealth/Data_Projects/scRNAseq_LungFibrosisAnalysis/GSE250396/data/preprocessed_seurat_obj.RData")
Analysis codes are adapted from HBCtraining and Ouyang Lab with credits going to the original authors of the publications cited in the book. Reference materials are adapted from SVI Bioinformatics and Cellular Genomics Lab with credits going to the original authors.